package com.zzb.framework.utils;

import com.alibaba.fastjson.JSONArray;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers for extracting a plain-text summary, image URLs and the
 * declared charset from HTML content.
 * Created by zhangzhenbin on 17-3-21.
 */
public class HtmlUtils {
    static final String CHARSET_STRING = "charset";

    /** Matches a single tag: everything from {@code <} up to the next {@code >}. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");

    /**
     * Captures the raw text following {@code src=} inside an {@code <img>} tag, up to the tag end.
     * The prefix is reluctant so the first {@code src} in the tag is used, and the lookbehind
     * rejects attributes that merely end in "src" (for example {@code data-src}).
     */
    private static final Pattern IMG_SRC_PATTERN = Pattern.compile(
            "<img\\s+[^>]*?(?<![\\w-])src\\s*=\\s*([^>]+)", Pattern.CASE_INSENSITIVE);

    /** Captures the charset name of a {@code charset=...} declaration, optionally quoted. */
    private static final Pattern CHARSET_PATTERN = Pattern.compile(
            CHARSET_STRING + "\\s*=\\s*[\"']?\\s*([-\\w]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Extracts a summary from HTML content.
     *
     * @param content HTML markup; may be {@code null}
     * @param length  end index passed to {@code StringUtils.substring(text, 0, length)}
     * @return the text of the parsed document truncated by that call, or an empty string
     *         when {@code content} is {@code null}
     */
    public static String getSummary(String content, int length) {
        if (content == null) {
            return "";
        }
        Document doc = Jsoup.parse(content);
        String text = doc.text();
        return StringUtils.substring(text, 0, length);
    }

    /**
     * Removes HTML tags from the given string.
     *
     * @param html markup to strip; may be {@code null}
     * @return {@code html} with every {@code <...>} sequence removed, or an empty string
     *         when {@code html} is {@code null}
     */
    public static String htmlFilter(String html) {
        if (html == null) {
            return "";
        }
        // The tag expression must be applied as a regex; String.replace treats its
        // first argument as literal text and would leave the markup untouched.
        return TAG_PATTERN.matcher(html).replaceAll("");
    }

    /**
     * Collects the {@code src} values of all {@code <img>} tags in the content.
     *
     * @param content HTML markup; may be {@code null}
     * @return a JSON array string of the image sources in document order; {@code "[]"}
     *         when there are none or {@code content} is {@code null}
     */
    public static String getTitleImg(String content) {
        List<String> imgs = new ArrayList<>();
        if (content != null) {
            Matcher matcher = IMG_SRC_PATTERN.matcher(content);
            while (matcher.find()) {
                String src = unquoteAttribute(matcher.group(1));
                if (src != null) {
                    imgs.add(src);
                }
            }
        }
        JSONArray imgsJson = new JSONArray();
        imgsJson.addAll(imgs);
        return imgsJson.toJSONString();
    }

    /**
     * Reduces the raw text after {@code src=} to the attribute value itself.
     * Escapes inside the value are not interpreted.
     *
     * @param raw text captured after {@code src=}, up to the end of the tag
     * @return the value without surrounding quotes, or {@code null} when no value is present
     */
    private static String unquoteAttribute(String raw) {
        if (raw == null || raw.isEmpty()) {
            return null;
        }
        String candidate = raw;
        char first = raw.charAt(0);
        if (first == '\'' || first == '"') {
            int close = raw.indexOf(first, 1);
            if (close >= 0) {
                return raw.substring(1, close);
            }
            // Unterminated quote: drop the opening quote and treat the rest as unquoted
            // instead of failing on substring(1, -1).
            candidate = raw.substring(1);
        }
        // split() drops trailing empty strings, so a blank candidate yields an empty array;
        // take the first non-empty token rather than indexing [0] blindly.
        for (String token : candidate.split("\\s")) {
            if (!token.isEmpty()) {
                return token;
            }
        }
        return null;
    }

    /**
     * Finds the page encoding declared in the given text by regex.
     *
     * @param file page text to search; may be {@code null}
     * @return the first declared charset name, or {@code "utf-8"} when none is found
     */
    private static String getEnc(String file) {
        String enc = "utf-8";
        if (file == null) {
            return enc;
        }
        Matcher m = CHARSET_PATTERN.matcher(file);
        if (m.find()) {
            enc = m.group(1);
        }
        return enc;
    }

    /** Manual demo: prints the image sources found in a sample snippet. */
    public static void main(String[] args) {
        String content = "<html><li><p>呵呵</p>我是超人!<img src = \"/dsaf/dsa/qwerd/sa/s/qerqf.jpg\"/><img src = \"/dsaf/dsa/fd/sa/fds/adsaf.jpg\"/></li><li>pqmda噢诶我去发了的萨芬</li><html>";
        System.out.println(getTitleImg(content));
    }
}
